{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/yaojiarui/miniconda3/envs/MoE/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import jsonlines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Point the http / https / all proxy environment variables at the endpoint on\n",
    "# 127.0.0.1:7890 (presumably needed to reach the dataset hub -- confirm).\n",
    "os.environ.update({\n",
    "    'http_proxy': 'http://127.0.0.1:7890',\n",
    "    'https_proxy': 'http://127.0.0.1:7890',\n",
    "    'all_proxy': 'socks5://127.0.0.1:7890',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_parity(re):\n",
    "    \"\"\"Assert that each conversation alternates user / assistant, starting with user.\n",
    "\n",
    "    Args:\n",
    "        re: dict with an 'instances' list; each instance has a 'messages' list of\n",
    "            dicts carrying a 'role' key.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: on the first message whose role breaks the pattern.\n",
    "    \"\"\"\n",
    "    expected_roles = ('user', 'assistant')\n",
    "    for conversation in re['instances']:\n",
    "        for position, turn in enumerate(conversation['messages']):\n",
    "            assert turn['role'] == expected_roles[position % 2]\n",
    "\n",
    "def check_parity_ob(re):\n",
    "    \"\"\"Assert the four-step cycle user -> function -> observation -> assistant.\n",
    "\n",
    "    Args:\n",
    "        re: dict with an 'instances' list; each instance has a 'messages' list of\n",
    "            dicts carrying a 'role' key.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: on the first message whose role breaks the cycle.\n",
    "    \"\"\"\n",
    "    role_cycle = ('user', 'function', 'observation', 'assistant')\n",
    "    for conversation in re['instances']:\n",
    "        for position, turn in enumerate(conversation['messages']):\n",
    "            assert turn['role'] == role_cycle[position % 4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Datasets this notebook targets; commented-out entries are excluded from the list.\n",
    "# NOTE(review): the two Gorilla entries look like display names rather than hub IDs,\n",
    "# and nothing in the visible cells reads this list -- confirm whether it is still needed.\n",
    "dataset_names = [\n",
    "    'glaiveai/glaive-function-calling-v2',\n",
    "    'Salesforce/xlam-function-calling-60k',\n",
    "    'Gorilla OpenFunctions-v2',\n",
    "    'Gorilla OpenFunctions-v1',\n",
    "    'ise-uiuc/Magicoder-OSS-Instruct-75K',\n",
    "    'RLHFlow/CodeUltraFeedback-standard',\n",
    "    'codeparrot/apps',\n",
    "    'meta-math/MetaMathQA',\n",
    "    # 'TIGER-Lab/MathInstruct',\n",
    "    'camel-ai/math',\n",
    "    # 'xinlai/Math-Step-DPO-10K',\n",
    "    'openai/gsm8k'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 112960/112960 [00:03<00:00, 33582.18it/s]\n"
     ]
    }
   ],
   "source": [
    "# process glaiveai/glaive-function-calling-v2 into plain user/assistant turns\n",
    "ds = load_dataset('glaiveai/glaive-function-calling-v2')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "for i, item in enumerate(tqdm(ds['train'])):\n",
    "    # Drop the literal 'SYSTEM:' tag only. str.lstrip('SYSTEM:') treats its argument\n",
    "    # as a set of characters, so it could also eat the first letters of the prompt.\n",
    "    system = item['system'].strip()\n",
    "    if system.startswith('SYSTEM:'):\n",
    "        system = system[len('SYSTEM:'):].strip()\n",
    "    instance = {\"conversation_id\": i, \"system\": system, \"messages\": []}\n",
    "    # Whatever precedes the first 'USER: ' marker is discarded.\n",
    "    for pair in item['chat'].split('USER: ')[1:]:\n",
    "        user_text, *assistant_parts = pair.split('ASSISTANT: ')\n",
    "        instance['messages'].append({\"role\": \"user\", \"content\": user_text.strip()})\n",
    "        # only user and assistant: every 'ASSISTANT: ' segment of the turn is\n",
    "        # concatenated (no separator) into a single assistant message\n",
    "        assistant_message = ''.join(part.strip() for part in assistant_parts)\n",
    "        instance['messages'].append({\"role\": \"assistant\", \"content\": assistant_message})\n",
    "    re['instances'].append(instance)\n",
    "\n",
    "with open('data/glaive-function-calling-v2.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 112960/112960 [00:04<00:00, 27512.44it/s]\n"
     ]
    }
   ],
   "source": [
    "# process glaiveai/glaive-function-calling-v2 in sharegpt form:\n",
    "# every user turn is emitted as user -> function -> observation -> assistant\n",
    "def _clean_function_call(call):\n",
    "    \"\"\"Return `call` without the literal '<functioncall>' / '<|endoftext|>' markers.\"\"\"\n",
    "    call = call.strip()\n",
    "    # Slice off the exact markers: lstrip/rstrip take character sets and could\n",
    "    # also remove payload characters that happen to be in the marker text.\n",
    "    if call.startswith('<functioncall>'):\n",
    "        call = call[len('<functioncall>'):]\n",
    "    if call.endswith('<|endoftext|>'):\n",
    "        call = call[:-len('<|endoftext|>')]\n",
    "    return call.strip()\n",
    "\n",
    "ds = load_dataset('glaiveai/glaive-function-calling-v2')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "for i, item in enumerate(tqdm(ds['train'])):\n",
    "    # Remove only the literal 'SYSTEM:' tag (not a character set, see above).\n",
    "    system = item['system'].strip()\n",
    "    if system.startswith('SYSTEM:'):\n",
    "        system = system[len('SYSTEM:'):].strip()\n",
    "    instance = {\"conversation_id\": i, \"system\": system, \"messages\": []}\n",
    "    # Whatever precedes the first 'USER: ' marker is discarded.\n",
    "    for pair in item['chat'].split('USER: ')[1:]:\n",
    "        turn = pair.split('ASSISTANT: ')\n",
    "        instance['messages'].append({\"role\": \"user\", \"content\": turn[0].strip()})\n",
    "\n",
    "        # Find the first assistant segment holding exactly one function response;\n",
    "        # segments with zero or several responses are not treated as the call.\n",
    "        call_index, call_parts = None, None\n",
    "        for k in range(1, len(turn)):\n",
    "            parts = turn[k].split('FUNCTION RESPONSE: ')\n",
    "            if len(parts) == 2:\n",
    "                call_index, call_parts = k, parts\n",
    "                break\n",
    "\n",
    "        function_call = observation = assistant_message = ''\n",
    "        if call_index is not None:\n",
    "            function_call = _clean_function_call(call_parts[0])\n",
    "            observation = call_parts[1].strip()\n",
    "            # All other assistant segments are concatenated into the final reply.\n",
    "            assistant_message = ''.join(\n",
    "                turn[j].strip() for j in range(1, len(turn)) if j != call_index\n",
    "            )\n",
    "        elif len(turn) == 2:\n",
    "            assistant_message = turn[1].strip()\n",
    "        elif len(turn) > 2:\n",
    "            print(turn)\n",
    "            raise ValueError(\"Invalid chat\")\n",
    "        # len(turn) == 1 (no assistant text) keeps all three placeholders empty, so\n",
    "        # every user turn still contributes exactly four messages.\n",
    "\n",
    "        instance['messages'].append({\"role\": \"function\", \"content\": function_call})\n",
    "        instance['messages'].append({\"role\": \"observation\", \"content\": observation})\n",
    "        instance['messages'].append({\"role\": \"assistant\", \"content\": assistant_message})\n",
    "\n",
    "    re['instances'].append(instance)\n",
    "\n",
    "with open('data/glaive-function-calling-v2-sharegpt.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch: split one sample chat into per-user-turn chunks for inspection.\n",
    "# Relies on `ds` from an earlier cell (presumably the glaive dataset -- confirm).\n",
    "sample_row = ds['train'][1]\n",
    "chat = sample_row['chat']\n",
    "pairs = chat.split('USER: ')[1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['<functioncall> {\"name\": \"get_news_headlines\", \"arguments\": \\'{\"country\": \"United States\"}\\'} <|endoftext|>\\n\\n\\n',\n",
       " '{\"headlines\": [\"Biden announces new vaccine mandates\", \"Hurricane Ida devastates Louisiana\", \"Apple unveils new iPhone\", \"NASA\\'s Perseverance rover collects first Mars rock sample\"]}\\n\\n\\n']"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show how the first assistant segment splits around the function-response marker.\n",
    "first_assistant_segment = pairs[0].split('ASSISTANT: ')[1]\n",
    "first_assistant_segment.split('FUNCTION RESPONSE: ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process hiyouga/glaive-function-calling-v2-sharegpt\n",
    "# ds = load_dataset('hiyouga/glaive-function-calling-v2-sharegpt')\n",
    "# NOTE: not every conversation follows the {user, assistant} or {user, function, observation, assistant} pattern\n",
    "\n",
    "# re = {\"type\": \"conversation\", \"instances\": []}\n",
    "# for i, item in enumerate(tqdm(ds['train'])):\n",
    "#     instantce = {\"conversation_id\": i, \"system\": '', \"tools\": item['tools'], \"messages\": []}\n",
    "#     chat = item['conversations']\n",
    "#     index = 0\n",
    "#     done = False\n",
    "#     while index < len(chat):\n",
    "#         if chat[index]['from'] == 'human':\n",
    "#             instantce['messages'].append({\"role\": \"user\", \"content\": chat[index]['value']})\n",
    "#             if chat[index+1]['from'] == 'gpt':\n",
    "#                 instantce['messages'].append({\"role\": \"function\", \"content\": ''})\n",
    "#                 instantce['messages'].append({\"role\": \"observation\", \"content\": ''})\n",
    "#                 instantce['messages'].append({\"role\": \"assistant\", \"content\": chat[index+1]['value']})\n",
    "#                 index += 2\n",
    "#             else:\n",
    "#                 instantce['messages'].append({\"role\": \"function\", \"content\": chat[index+1]['value']})\n",
    "#                 instantce['messages'].append({\"role\": \"observation\", \"content\": chat[index+2]['value']})\n",
    "#                 instantce['messages'].append({\"role\": \"assistant\", \"content\": chat[index+3]['value']})\n",
    "#                 index += 4\n",
    "#         else:\n",
    "#             print(chat)\n",
    "#             done = True\n",
    "#             break\n",
    "#     if done:\n",
    "#         break\n",
    "#     re['instances'].append(instantce)\n",
    "\n",
    "# with open('data/glaive-function-calling-v2-sharegpt.json', 'w') as f:\n",
    "#     json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 60000/60000 [00:03<00:00, 18310.08it/s]\n"
     ]
    }
   ],
   "source": [
    "# process Salesforce/xlam-function-calling-60k\n",
    "# One user query and one assistant answer per conversation, plus the offered tools.\n",
    "ds = load_dataset('Salesforce/xlam-function-calling-60k')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "re['instances'].extend(\n",
    "    {\n",
    "        \"conversation_id\": i,\n",
    "        \"system\": '',\n",
    "        # item['tools'] is decoded from JSON and each tool stored via str().\n",
    "        # NOTE(review): str() gives a Python repr, not JSON -- confirm the consumer expects that.\n",
    "        \"tools\": [str(tool) for tool in json.loads(item['tools'])],\n",
    "        \"messages\": [\n",
    "            {\"role\": \"user\", \"content\": item['query']},\n",
    "            {\"role\": \"assistant\", \"content\": item['answers']},\n",
    "        ],\n",
    "    }\n",
    "    for i, item in enumerate(tqdm(ds['train']))\n",
    ")\n",
    "\n",
    "with open('data/xlam-function-calling-60k.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process gorilla/data/apibench\n",
    "def _read_jsonl(path):\n",
    "    \"\"\"Return every record of the JSON-lines file at `path` as a list.\"\"\"\n",
    "    with jsonlines.open(path, 'r') as reader:\n",
    "        return list(reader)\n",
    "\n",
    "# API catalogues\n",
    "api_path = 'gorilla/data/api'\n",
    "huggingface_api = _read_jsonl(f'{api_path}/huggingface_api.jsonl')\n",
    "tensorflowhub_api = _read_jsonl(f'{api_path}/tensorflowhub_api.jsonl')\n",
    "torchhub_api = _read_jsonl(f'{api_path}/torchhub_api.jsonl')\n",
    "\n",
    "# APIBench train / eval splits (read with jsonlines despite the .json suffix)\n",
    "apibench_path = 'gorilla/data/apibench'\n",
    "huggingface_train = _read_jsonl(f'{apibench_path}/huggingface_train.json')\n",
    "huggingface_eval = _read_jsonl(f'{apibench_path}/huggingface_eval.json')\n",
    "tensorflow_train = _read_jsonl(f'{apibench_path}/tensorflow_train.json')\n",
    "tensorflow_eval = _read_jsonl(f'{apibench_path}/tensorflow_eval.json')\n",
    "torchhub_train = _read_jsonl(f'{apibench_path}/torchhub_train.json')\n",
    "torchhub_eval = _read_jsonl(f'{apibench_path}/torchhub_eval.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/8191 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 8191/8191 [00:00<00:00, 39924.03it/s]\n",
      "100%|██████████| 911/911 [00:00<00:00, 93836.22it/s]\n",
      "100%|██████████| 6190/6190 [00:00<00:00, 126633.93it/s]\n",
      "100%|██████████| 688/688 [00:00<00:00, 125622.79it/s]\n",
      "  0%|          | 0/837 [00:00<?, ?it/s]\n",
      "100%|██████████| 186/186 [00:00<00:00, 76952.12it/s]\n"
     ]
    }
   ],
   "source": [
    "# Splits to convert, keyed by the name that process_api uses for the output file.\n",
    "dss = dict(\n",
    "    huggingface_train=huggingface_train,\n",
    "    huggingface_eval=huggingface_eval,\n",
    "    tensorflow_train=tensorflow_train,\n",
    "    tensorflow_eval=tensorflow_eval,\n",
    "    torchhub_train=torchhub_train,\n",
    "    torchhub_eval=torchhub_eval,\n",
    ")\n",
    "def process_api(api_name, ds):\n",
    "    \"\"\"Convert one APIBench split from the global `dss` into a conversation file.\n",
    "\n",
    "    Args:\n",
    "        api_name: Name of the matching API catalogue. Not used by the conversion;\n",
    "            kept so existing calls keep working.\n",
    "        ds: Key into `dss`; also the stem of the output file name.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If an item's 'code' field contains no '###'-delimited section.\n",
    "    \"\"\"\n",
    "    re = {\"type\": \"conversation\", \"instances\": []}\n",
    "    for i, item in enumerate(tqdm(dss[ds])):\n",
    "        # Text ahead of the first '###' marker is dropped.\n",
    "        sections = item['code'].split('###')[1:]\n",
    "        if not sections:\n",
    "            # Fail with context instead of an opaque IndexError on sections[-1].\n",
    "            raise ValueError(f\"{ds}: item {i} has no '###'-delimited section in 'code'\")\n",
    "        # Every section but the last forms the user prompt; the last is the reply.\n",
    "        user_message = ''.join(section.strip() for section in sections[:-1])\n",
    "        messages = [\n",
    "            {\"role\": \"user\", \"content\": user_message},\n",
    "            {\"role\": \"assistant\", \"content\": sections[-1].strip()},\n",
    "        ]\n",
    "        instance = {\"conversation_id\": i, \"system\": '', \"tools\": [json.dumps(item['api_data'])], \"messages\": messages}\n",
    "        re['instances'].append(instance)\n",
    "\n",
    "    with open(f'data/gorilla_apibench/{ds}.json', 'w') as f:\n",
    "        json.dump(re, f, indent=4)\n",
    "\n",
    "# Convert every split, choosing the API catalogue name from the split key.\n",
    "for k in dss:\n",
    "    if 'huggingface' in k:\n",
    "        api_name = 'huggingface_api'\n",
    "    elif 'tensorflow' in k:\n",
    "        api_name = 'tensorflowhub_api'\n",
    "    elif 'torchhub' in k:\n",
    "        api_name = 'torchhub_api'\n",
    "    else:\n",
    "        # Without this branch api_name would be unbound or left over from the previous split.\n",
    "        print(f'skipping {k}: no matching API catalogue')\n",
    "        continue\n",
    "    try:\n",
    "        process_api(api_name, k)\n",
    "    except Exception as exc:\n",
    "        # Keep converting the remaining splits, but say which one failed and why;\n",
    "        # a bare except hid failed splits and also swallowed KeyboardInterrupt.\n",
    "        print(f'failed to process {k}: {exc!r}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 12125/12125 [00:00<00:00, 318843.99it/s]\n"
     ]
    }
   ],
   "source": [
    "# process gorilla openfunctions-v1\n",
    "with jsonlines.open('gorilla/openfunctions/openfunctions-v1/gorilla_openfunctions_v1_train.json', 'r') as f:\n",
    "    gorilla_openfunctions_v1 = list(f)\n",
    "\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "re['instances'].extend(\n",
    "    {\n",
    "        \"conversation_id\": i,\n",
    "        \"system\": '',\n",
    "        \"tools\": item['Functions'],\n",
    "        \"messages\": [\n",
    "            {\"role\": \"user\", \"content\": item['Instruction']},\n",
    "            # the entries of item['Output'] are joined with newlines into one reply\n",
    "            {\"role\": \"assistant\", \"content\": '\\n'.join(item['Output'])},\n",
    "        ],\n",
    "    }\n",
    "    for i, item in enumerate(tqdm(gorilla_openfunctions_v1))\n",
    ")\n",
    "\n",
    "with open('data/gorilla_openfunctions_v1_train.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 75197/75197 [00:03<00:00, 22049.76it/s]\n"
     ]
    }
   ],
   "source": [
    "# process ise-uiuc/Magicoder-OSS-Instruct-75K\n",
    "# Each row becomes a single-turn conversation: problem -> solution.\n",
    "ds = load_dataset('ise-uiuc/Magicoder-OSS-Instruct-75K')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "re['instances'].extend(\n",
    "    {\n",
    "        \"conversation_id\": i,\n",
    "        \"system\": '',\n",
    "        \"messages\": [\n",
    "            {\"role\": \"user\", \"content\": item['problem']},\n",
    "            {\"role\": \"assistant\", \"content\": item['solution']},\n",
    "        ],\n",
    "    }\n",
    "    for i, item in enumerate(tqdm(ds['train']))\n",
    ")\n",
    "\n",
    "with open('data/Magicoder-OSS-Instruct-75K.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using the latest cached version of the dataset since RLHFlow/CodeUltraFeedback-standard couldn't be found on the Hugging Face Hub\n",
      "Found the latest cached dataset configuration 'default' at /home/yaojiarui/.cache/huggingface/datasets/RLHFlow___code_ultra_feedback-standard/default/0.0.0/bcb5b923b9d02e6da2b410768fe22f91dfc450db (last modified on Thu Aug  8 18:06:49 2024).\n",
      "100%|██████████| 50156/50156 [00:02<00:00, 19835.57it/s]\n"
     ]
    }
   ],
   "source": [
    "# process RLHFlow/CodeUltraFeedback-standard\n",
    "# Keeps the 'chosen' conversation of each row, skipping a row whose 'chosen' equals\n",
    "# the most recently kept one. Conversation ids are row indices, so skips leave gaps.\n",
    "ds = load_dataset('RLHFlow/CodeUltraFeedback-standard')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "last_chosen = None\n",
    "for i, item in enumerate(tqdm(ds['train'])):\n",
    "    chosen = item['chosen']\n",
    "    if not last_chosen or chosen != last_chosen:\n",
    "        last_chosen = chosen\n",
    "        re['instances'].append({\"conversation_id\": i, \"system\": '', \"messages\": chosen})\n",
    "\n",
    "with open('data/CodeUltraFeedback-standard.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading readme: 100%|██████████| 4.45k/4.45k [00:00<00:00, 11.8MB/s]\n",
      "Downloading data: 100%|██████████| 396M/396M [01:57<00:00, 3.37MB/s] \n",
      "Generating train split: 100%|██████████| 395000/395000 [00:09<00:00, 42574.12 examples/s]\n"
     ]
    }
   ],
   "source": [
    "# process meta-math/MetaMathQA\n",
    "def _metamath_instance(index, row):\n",
    "    \"\"\"Build the single-turn conversation (query -> response) for one row.\"\"\"\n",
    "    return {\n",
    "        \"conversation_id\": index,\n",
    "        \"system\": '',\n",
    "        \"messages\": [\n",
    "            {\"role\": \"user\", \"content\": row['query']},\n",
    "            {\"role\": \"assistant\", \"content\": row['response']},\n",
    "        ],\n",
    "    }\n",
    "\n",
    "ds = load_dataset('meta-math/MetaMathQA')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "for index, row in enumerate(tqdm(ds['train'])):\n",
    "    re['instances'].append(_metamath_instance(index, row))\n",
    "\n",
    "with open('data/MetaMathQA.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process openai/gsm8k\n",
    "# Train split of the 'main' config; each row becomes question -> answer.\n",
    "ds = load_dataset('openai/gsm8k', 'main')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "re['instances'].extend(\n",
    "    {\n",
    "        \"conversation_id\": i,\n",
    "        \"system\": '',\n",
    "        \"messages\": [\n",
    "            {\"role\": \"user\", \"content\": item['question']},\n",
    "            {\"role\": \"assistant\", \"content\": item['answer']},\n",
    "        ],\n",
    "    }\n",
    "    for i, item in enumerate(tqdm(ds['train']))\n",
    ")\n",
    "\n",
    "with open('data/gsm8k.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process camel-ai/math\n",
    "# message_1 is stored as the user turn and message_2 as the assistant turn.\n",
    "ds = load_dataset('camel-ai/math')\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "for row_index, row in enumerate(tqdm(ds['train'])):\n",
    "    user_turn = {\"role\": \"user\", \"content\": row['message_1']}\n",
    "    assistant_turn = {\"role\": \"assistant\", \"content\": row['message_2']}\n",
    "    re['instances'].append(\n",
    "        {\"conversation_id\": row_index, \"system\": '', \"messages\": [user_turn, assistant_turn]}\n",
    "    )\n",
    "\n",
    "with open('data/camel-ai-math.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process TIGER-Lab/MathInstruct\n",
    "# Only loads the dataset into `ds`; no conversion or output file follows in this cell.\n",
    "ds = load_dataset('TIGER-Lab/MathInstruct')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1726/1726 [00:00<00:00, 24064.89it/s]\n"
     ]
    }
   ],
   "source": [
    "# process gen-bfcl-filtered\n",
    "# The converted data goes to its own file: writing it back over the JSON-lines input\n",
    "# would destroy the source and make this cell fail when it is run a second time.\n",
    "with jsonlines.open('data/autoGen/gen-bfcl-filtered.json', 'r') as f:\n",
    "    data = list(f)\n",
    "\n",
    "re = {\"type\": \"conversation\", \"instances\": []}\n",
    "for item in tqdm(data):\n",
    "    # Every input line holds several samples; conversation ids run across all of them.\n",
    "    for dic in item:\n",
    "        messages = [\n",
    "            {\"role\": \"user\", \"content\": dic['instruction']},\n",
    "            # the reply is the generated code followed by its explanation\n",
    "            {\"role\": \"assistant\", \"content\": dic['output']['code'] + '\\n\\n' + dic['output']['explanation']},\n",
    "        ]\n",
    "        instance = {\n",
    "            \"conversation_id\": len(re['instances']),\n",
    "            \"system\": '',\n",
    "            \"tools\": [str(dic['api_data'])],\n",
    "            \"messages\": messages,\n",
    "        }\n",
    "        re['instances'].append(instance)\n",
    "\n",
    "with open('data/autoGen/gen-bfcl-filtered-conversation.json', 'w') as f:\n",
    "    json.dump(re, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading builder script: 100%|██████████| 4.95k/4.95k [00:00<00:00, 20.8MB/s]\n",
      "Downloading readme: 100%|██████████| 5.63k/5.63k [00:00<00:00, 5.10MB/s]\n",
      "Downloading data: 100%|██████████| 107M/107M [00:17<00:00, 6.12MB/s]   \n",
      "Downloading data: 100%|██████████| 1.29G/1.29G [04:33<00:00, 4.73MB/s]   \n",
      "Generating train split: 5000 examples [00:00, 7050.91 examples/s]\n",
      "Generating test split: 5000 examples [00:07, 635.31 examples/s] \n"
     ]
    }
   ],
   "source": [
    "# Only loads codeparrot/apps into `ds`; no conversion follows in the visible cells.\n",
    "ds = load_dataset('codeparrot/apps')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MoE",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
